{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# rspb_gola_2020\n",
    "\n",
    "This notebook is a template for how new datasets can be formatted for ingestion into the database.\n",
    "\n",
    "The ideal dataset has both **location** and **sequence** information, in addition to any species or bounding box labels."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Give the path to the JSON file that the output of this notebook will be written to. You can then pass this file to the .NET app for ingestion into the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'rspb_gola_2020'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/rspb_gola_2020'  \n",
    "path_prefix = 'gola-labeled-20201116/Gola Darwin 2020 - humans removed'  # as they are on the container\n",
    "\n",
    "downloaded_dir = '/mink_disk_0/camtraps/rspb_gola_2020/Gola Darwin 2020 - humans removed/'  # AzCopied the container to data disk, with one fewer level of directory\n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
    "The labels are very neat, contained in four CSVs. \n",
    "- Some entries in the CSVs are not present in blob storage (probably the images containing people).\n",
    "- The \"RelativePath\" column is the location. \n",
    "- Sequence info can be extracted from the file names.\n",
    "- There isn't a good identifier for each sequence, but the timestamp truncated to the minute is an acceptable divider between sequences. It seems that the camera restarts its numbering from (1) in each new minute.\n",
    "\n",
    "Images were AzCopied to the data disk first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14331"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "14282"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files_list = path_utils.recursive_file_list(downloaded_dir, convert_slashes=False)\n",
    "len(files_list)\n",
    "\n",
    "# keep only the image files, stored as the part of each path that follows downloaded_dir\n",
    "images_set = {\n",
    "    file_path.split(downloaded_dir)[1]\n",
    "    for file_path in files_list\n",
    "    if path_utils.is_image_file(file_path)\n",
    "}\n",
    "len(images_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D180600/DWCN22/D180600__DWCN22__2020-03-23__00-21-21(3).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115575/DWCN16/D115575__DWCN16__2020-03-13__23-29-44(2).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D180605/DWCN23/D180605__DWCN23__2020-03-04__20-07-43(5).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D170590/DWCN29/D170590__DWCN29__2020-02-27__14-20-35(15).JPG',\n",
       " 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D115540/DWC N 19/D115540__DWC N 19__2020-05-21__05-19-46(45).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115570/DWCN15/D115570__DWCN15__2020-03-08__06-00-44(1).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D170585/DWCN20/D170585__DWCN20__2020-03-16__18-08-47(5).JPG',\n",
       " 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D110535/DWC N 17/D110535__DWC N 17__2020-06-21__04-58-49(3).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D190595/DWCN25/D190595__DWCN25__2017-02-09__14-51-55(4).JPG',\n",
       " 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115560/DWCN14/D115560__DWCN14__2020-03-21__08-53-00(2).JPG']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(images_set)[1000:1010]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the CSV labels from RSPB\n",
    "\n",
    "csv_paths = [os.path.join(downloaded_dir, p) for p in os.listdir(downloaded_dir) if p.endswith('.csv')]\n",
    "len(csv_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9357\n",
      "4252\n",
      "2730\n",
      "3949\n"
     ]
    }
   ],
   "source": [
    "# columns to read from each label CSV\n",
    "label_columns = [\n",
    "    'File', 'RelativePath', 'Folder', 'Date', 'Time', 'ImageQuality',\n",
    "    'DeleteFlag', 'Category', 'Event', 'SpeciesGroup', 'SpeciesName',\n",
    "    'Count', 'Age', 'Sex', 'Obstruction'\n",
    "]\n",
    "\n",
    "csv_dfs = []\n",
    "for csv_path in csv_paths:\n",
    "    labels_df = pd.read_csv(csv_path, usecols=label_columns)\n",
    "    print(len(labels_df))  # number of rows in this CSV\n",
    "    csv_dfs.append(labels_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_csv = pd.concat(csv_dfs, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['File', 'RelativePath', 'Folder', 'Date', 'Time', 'ImageQuality',\n",
       "       'DeleteFlag', 'Category', 'Event', 'SpeciesGroup', 'SpeciesName',\n",
       "       'Count', 'Age', 'Sex', 'Obstruction'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "(20288, 15)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv.columns\n",
    "all_csv.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>File</th>\n",
       "      <th>RelativePath</th>\n",
       "      <th>Folder</th>\n",
       "      <th>Date</th>\n",
       "      <th>Time</th>\n",
       "      <th>ImageQuality</th>\n",
       "      <th>DeleteFlag</th>\n",
       "      <th>Category</th>\n",
       "      <th>Event</th>\n",
       "      <th>SpeciesGroup</th>\n",
       "      <th>SpeciesName</th>\n",
       "      <th>Count</th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Obstruction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19924</th>\n",
       "      <td>D120605__DWC 07__2020-01-29__09-37-00(1).JPG</td>\n",
       "      <td>D120605\\DWC 07</td>\n",
       "      <td>Darwin_CamTrap_Jun_2020_Renamed</td>\n",
       "      <td>29-Jan-2020</td>\n",
       "      <td>09:37:00</td>\n",
       "      <td>Ok</td>\n",
       "      <td>False</td>\n",
       "      <td>Deployment</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>729</th>\n",
       "      <td>D105585__DWCN05__2020-03-12__07-44-23(3).JPG</td>\n",
       "      <td>D105585\\DWCN05</td>\n",
       "      <td>Darwin_CamTrap_Feb_2020_Renamed</td>\n",
       "      <td>12-Mar-2020</td>\n",
       "      <td>07:44:23</td>\n",
       "      <td>Ok</td>\n",
       "      <td>False</td>\n",
       "      <td>Animal</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Bovid</td>\n",
       "      <td>Duiker sp</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4891</th>\n",
       "      <td>D170585__DWCN20__2020-03-21__15-13-52(32).JPG</td>\n",
       "      <td>D170585\\DWCN20</td>\n",
       "      <td>Darwin_CamTrap_Feb_2020_Renamed</td>\n",
       "      <td>21-Mar-2020</td>\n",
       "      <td>15:13:52</td>\n",
       "      <td>Ok</td>\n",
       "      <td>False</td>\n",
       "      <td>Animal</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Bovid</td>\n",
       "      <td>Maxwell's duiker</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                File    RelativePath  \\\n",
       "19924   D120605__DWC 07__2020-01-29__09-37-00(1).JPG  D120605\\DWC 07   \n",
       "729     D105585__DWCN05__2020-03-12__07-44-23(3).JPG  D105585\\DWCN05   \n",
       "4891   D170585__DWCN20__2020-03-21__15-13-52(32).JPG  D170585\\DWCN20   \n",
       "\n",
       "                                Folder         Date      Time ImageQuality  \\\n",
       "19924  Darwin_CamTrap_Jun_2020_Renamed  29-Jan-2020  09:37:00           Ok   \n",
       "729    Darwin_CamTrap_Feb_2020_Renamed  12-Mar-2020  07:44:23           Ok   \n",
       "4891   Darwin_CamTrap_Feb_2020_Renamed  21-Mar-2020  15:13:52           Ok   \n",
       "\n",
       "       DeleteFlag    Category  Event SpeciesGroup       SpeciesName  Count  \\\n",
       "19924       False  Deployment    1.0          NaN               NaN      0   \n",
       "729         False      Animal    1.0        Bovid         Duiker sp      1   \n",
       "4891        False      Animal    3.0        Bovid  Maxwell's duiker      1   \n",
       "\n",
       "       Age  Sex Obstruction  \n",
       "19924  NaN  NaN         NaN  \n",
       "729    NaN  NaN         NaN  \n",
       "4891   NaN  NaN         NaN  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_csv.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.isna(all_csv.loc[1, 'SpeciesName'])  # missing species names are NaN; pd.isna is also safe when the value is a str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nan\n",
      "Unknown\n",
      "Duiker sp\n",
      "Squirrel sp\n",
      "Mouse sp\n",
      "Forest giant pouched rat\n",
      "White-bellied pangolin\n",
      "Fire-footed rope squirrel\n",
      "Brush-tailed porcupine\n",
      "Slender-tailed squirrel\n",
      "Western tree hyrax\n",
      "Genet sp\n",
      "Marsh mongoose\n",
      "Sooty mangabey\n",
      "Other (describe in notes)\n",
      "Lesser spot-nosed monkey\n",
      "Red river hog\n",
      "Maxwell's duiker\n",
      "Bongo\n",
      "White-breasted guineafowl\n",
      "Latham's francolin\n",
      "Campbell's monkey\n",
      "Bushbuck\n",
      "African civet\n",
      "Common cusimanse\n",
      "Mongoose sp\n",
      "Black duiker\n",
      "Western chimpanzee\n",
      "African giant squirrel\n",
      "Ornate monitor\n",
      "Jentink's duiker\n",
      "Bay duiker\n",
      "Greater cane-rat (Marsh cane rat)\n",
      "Ogilby's duiker\n",
      "Nkulengu Rail\n",
      "Honey badger\n",
      "African palm civet\n",
      "Johnston's genet\n",
      "African forest buffalo\n",
      "Ichneumon Mongoose\n",
      "Timneh parrot\n",
      "Crested guineafowl\n",
      "Galago sp.\n"
     ]
    }
   ],
   "source": [
    "for i in pd.unique(all_csv['SpeciesName']):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nan\n",
      "Primate\n",
      "Bovid\n",
      "Unidentified\n",
      "Squirrel\n",
      "Non-squirrel rodent\n",
      "Bird\n",
      "Pangolin\n",
      "Hyrax\n",
      "Carnivore\n",
      "Pig\n",
      "Other\n",
      "None\n",
      "Antelope\n",
      "Mustelid\n"
     ]
    }
   ],
   "source": [
    "for i in pd.unique(all_csv['SpeciesGroup']):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "20288it [00:03, 6473.26it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one flat entry per labeled image that is actually present on disk.\n",
    "embedded = []        # image entries, later grouped into sequences\n",
    "seq_to_loc = {}      # seq_id -> location (RelativePath with forward slashes)\n",
    "images_missing = []  # files listed in the CSVs but absent from images_set\n",
    "num_images = 0       # number of entries appended to embedded\n",
    "\n",
    "for i_row, row in tqdm(all_csv.iterrows(), total=len(all_csv)):\n",
    "\n",
    "    # the CSVs use Windows-style separators\n",
    "    rel_path = row['RelativePath'].replace('\\\\', '/')\n",
    "\n",
    "    # the folder names on disk carry a '_peopleremoved' suffix that the CSV 'Folder' column lacks\n",
    "    file = os.path.join(row['Folder'] + '_peopleremoved', rel_path, row['File'])\n",
    "\n",
    "    if file not in images_set:\n",
    "        images_missing.append(file)\n",
    "        continue\n",
    "\n",
    "    # File names end in '...__HH-MM-SS(<frame>).JPG'. Dropping the trailing '-SS'\n",
    "    # gives the file name up to the minute, which is used as the sequence ID.\n",
    "    seq_id = row['File'].split('(')[0][:-3]\n",
    "    frame_num = int(row['File'].split('(')[1].split(')')[0])  # the number in parentheses\n",
    "\n",
    "    datetime = row['Date'] + ' ' + row['Time']\n",
    "\n",
    "    # missing values are NaN (a float), so only str values are usable\n",
    "    # NOTE(review): the CSVs may contain the literal string 'None' as a SpeciesGroup;\n",
    "    # it is used as-is here - confirm that this is intended\n",
    "    species_group = row['SpeciesGroup'].lower() if isinstance(row['SpeciesGroup'], str) else None\n",
    "\n",
    "    # Class label: 'empty' for images in the Empty category, otherwise the species name,\n",
    "    # otherwise fall back to the coarser species group.\n",
    "    clss = None\n",
    "    if row['Category'] == 'Empty':\n",
    "        clss = 'empty'\n",
    "    elif isinstance(row['SpeciesName'], str):\n",
    "        clss = row['SpeciesName'].lower()\n",
    "    elif species_group is not None:\n",
    "        clss = species_group\n",
    "\n",
    "    # check before clss is dereferenced below, so an unlabeled row fails with a clear message\n",
    "    assert clss is not None, f'No usable class label for {file}'\n",
    "\n",
    "    # normalize: 'other (describe in notes)' -> 'other'; drop any parenthesized suffix\n",
    "    if clss.startswith('other'):\n",
    "        clss = 'other'\n",
    "    if '(' in clss:\n",
    "        clss = clss.split('(')[0].strip()\n",
    "\n",
    "    embedded.append({\n",
    "        'file': file,\n",
    "        'class': [clss],  # only one species per image it seems\n",
    "        'species_group': species_group,\n",
    "        'frame_num': frame_num,\n",
    "        'datetime': datetime,\n",
    "        'count': row['Count'],\n",
    "        'age': row['Age'],\n",
    "        'sex': row['Sex'],\n",
    "        'obstruction': row['Obstruction'],\n",
    "        'location': rel_path,\n",
    "        'seq_id': seq_id\n",
    "    })\n",
    "    seq_to_loc[seq_id] = rel_path\n",
    "    num_images += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2509"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(seq_to_loc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "75"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of locations\n",
    "len(set(seq_to_loc.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6006"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105580/DWCN19/D105580__DWCN19__2020-02-15__10-14-35(1).JPG'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "14282"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(images_missing)\n",
    "images_missing[100]\n",
    "\n",
    "num_images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to rspb_gola_2020. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 14282/14282 [00:00<00:00, 983145.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 14282 images into sequences...\n",
      "Number of sequences: 2509\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'count', 'location', 'datetime', 'class', 'species_group', 'age', 'sex', 'frame_num', 'obstruction', 'file'}\n",
      "\n",
      "img_level_properties\n",
      "{'count', 'datetime', 'class', 'species_group', 'age', 'frame_num', 'obstruction', 'file'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'sex', 'location'}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"rspb_gola_2020\", \"seq_id\": \"D100595__DWCN08__2020-02-18__17-46\", \"location\": \"D100595/DWCN08\", \"images\": [{\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D100595/DWCN08/D100595__DWCN08__2020-02-18__17-46-03(1).JPG\", \"class\": [\"unknown\"], \"species_group\": \"primate\", \"frame_num\": 1, \"datetime\": \"18-Feb-2020 17:46:03\", \"count\": 2, \"age\": null, \"obstruction\": \"Yes\"}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D100595/DWCN08/D100595__DWCN08__2020-02-18__17-46-03(2).JPG\", \"class\": [\"unknown\"], \"species_group\": \"primate\", \"frame_num\": 2, \"datetime\": \"18-Feb-2020 17:46:03\", \"count\": 2, \"age\": null, \"obstruction\": \"Yes\"}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D100595/DWCN08/D100595__DWCN08__2020-02-18__17-46-03(3).JPG\", \"class\": [\"unknown\"], \"species_group\": \"primate\", \"frame_num\": 3, \"datetime\": \"18-Feb-2020 17:46:03\", \"count\": 2, \"age\": null, \"obstruction\": \"Yes\"}], \"sex\": null}\n",
      "\n",
      "{\"dataset\": \"rspb_gola_2020\", \"seq_id\": \"D105600__DWCN07__2020-02-25__06-31\", \"location\": \"D105600/DWCN07\", \"images\": [{\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-39(1).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 1, \"datetime\": \"25-Feb-2020 06:31:39\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-39(2).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 2, \"datetime\": \"25-Feb-2020 06:31:39\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-39(3).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 3, \"datetime\": \"25-Feb-2020 06:31:39\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-41(4).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 4, \"datetime\": \"25-Feb-2020 06:31:41\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-41(5).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 5, \"datetime\": \"25-Feb-2020 06:31:41\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-41(6).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 6, \"datetime\": \"25-Feb-2020 06:31:41\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": 
\"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-42(7).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 7, \"datetime\": \"25-Feb-2020 06:31:42\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-42(8).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 8, \"datetime\": \"25-Feb-2020 06:31:42\", \"count\": 1, \"age\": null, \"obstruction\": null}, {\"file\": \"Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-25__06-31-42(9).JPG\", \"class\": [\"genet sp\"], \"species_group\": \"carnivore\", \"frame_num\": 9, \"datetime\": \"25-Feb-2020 06:31:42\", \"count\": 1, \"age\": null, \"obstruction\": null}], \"sex\": null}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sequences = process_sequences(embedded, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D115580__DWCN18__2020-03-13__00-49'),\n",
       "              ('location', 'D115580/DWCN18'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115580/DWCN18/D115580__DWCN18__2020-03-13__00-49-38(1).JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '13-Mar-2020 00:49:38',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115580/DWCN18/D115580__DWCN18__2020-03-13__00-49-38(2).JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '13-Mar-2020 00:49:38',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115580/DWCN18/D115580__DWCN18__2020-03-13__00-49-38(3).JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '13-Mar-2020 00:49:38',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D155625__DWC N 38__2020-04-03__15-01'),\n",
       "              ('location', 'D155625/DWC N 38'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D155625/DWC N 38/D155625__DWC N 38__2020-04-03__15-01-12(1).JPG',\n",
       "                 'class': ['sooty mangabey'],\n",
       "                 'species_group': 'primate',\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '03-Apr-2020 15:01:12',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D155625/DWC N 38/D155625__DWC N 38__2020-04-03__15-01-12(2).JPG',\n",
       "                 'class': ['sooty mangabey'],\n",
       "                 'species_group': 'primate',\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '03-Apr-2020 15:01:12',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D155625/DWC N 38/D155625__DWC N 38__2020-04-03__15-01-12(3).JPG',\n",
       "                 'class': ['sooty mangabey'],\n",
       "                 'species_group': 'primate',\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '03-Apr-2020 15:01:12',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D145630__N6__2019-12-25__16-30'),\n",
       "              ('location', 'D145630/N6'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D145630/N6/D145630__N6__2019-12-25__16-30-00(1).JPG',\n",
       "                 'class': ['unknown'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '25-Dec-19 16:30:00',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D175595__DWCN21__2020-03-01__18-27'),\n",
       "              ('location', 'D175595/DWCN21'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D175595/DWCN21/D175595__DWCN21__2020-03-01__18-27-33(1).JPG',\n",
       "                 'class': ['fire-footed rope squirrel'],\n",
       "                 'species_group': 'squirrel',\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '01-Mar-2020 18:27:33',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D175595/DWCN21/D175595__DWCN21__2020-03-01__18-27-33(2).JPG',\n",
       "                 'class': ['fire-footed rope squirrel'],\n",
       "                 'species_group': 'squirrel',\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '01-Mar-2020 18:27:33',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D175595/DWCN21/D175595__DWCN21__2020-03-01__18-27-33(3).JPG',\n",
       "                 'class': ['fire-footed rope squirrel'],\n",
       "                 'species_group': 'squirrel',\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '01-Mar-2020 18:27:33',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D175595/DWCN21/D175595__DWCN21__2020-03-01__18-27-35(4).JPG',\n",
       "                 'class': ['fire-footed rope squirrel'],\n",
       "                 'species_group': 'squirrel',\n",
       "                 'frame_num': 4,\n",
       "                 'datetime': '01-Mar-2020 18:27:35',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D175595/DWCN21/D175595__DWCN21__2020-03-01__18-27-35(5).JPG',\n",
       "                 'class': ['fire-footed rope squirrel'],\n",
       "                 'species_group': 'squirrel',\n",
       "                 'frame_num': 5,\n",
       "                 'datetime': '01-Mar-2020 18:27:35',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D175595/DWCN21/D175595__DWCN21__2020-03-01__18-27-35(6).JPG',\n",
       "                 'class': ['fire-footed rope squirrel'],\n",
       "                 'species_group': 'squirrel',\n",
       "                 'frame_num': 6,\n",
       "                 'datetime': '01-Mar-2020 18:27:35',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D105600__DWCN07__2020-02-29__01-29'),\n",
       "              ('location', 'D105600/DWCN07'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-29__01-29-43(1).JPG',\n",
       "                 'class': ['mouse sp'],\n",
       "                 'species_group': 'non-squirrel rodent',\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '29-Feb-2020 01:29:43',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-29__01-29-44(2).JPG',\n",
       "                 'class': ['mouse sp'],\n",
       "                 'species_group': 'non-squirrel rodent',\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '29-Feb-2020 01:29:44',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105600/DWCN07/D105600__DWCN07__2020-02-29__01-29-44(3).JPG',\n",
       "                 'class': ['mouse sp'],\n",
       "                 'species_group': 'non-squirrel rodent',\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '29-Feb-2020 01:29:44',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D135565__PH16__2020-01-09__05-35'),\n",
       "              ('location', 'D135565/PH16'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D135565/PH16/D135565__PH16__2020-01-09__05-35-36(1).JPG',\n",
       "                 'class': ['brush-tailed porcupine'],\n",
       "                 'species_group': 'non-squirrel rodent',\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '09-Jan-20 05:35:36',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D135565/PH16/D135565__PH16__2020-01-09__05-35-37(2).JPG',\n",
       "                 'class': ['brush-tailed porcupine'],\n",
       "                 'species_group': 'non-squirrel rodent',\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '09-Jan-20 05:35:37',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D135565/PH16/D135565__PH16__2020-01-09__05-35-37(3).JPG',\n",
       "                 'class': ['brush-tailed porcupine'],\n",
       "                 'species_group': 'non-squirrel rodent',\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '09-Jan-20 05:35:37',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D015450__DWC N 19__2020-07-31__20-48'),\n",
       "              ('location', 'D015450/DWC N 19'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-02(1).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '31-Jul-2020 20:48:02',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-02(2).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '31-Jul-2020 20:48:02',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-02(3).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '31-Jul-2020 20:48:02',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-09(4).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 4,\n",
       "                 'datetime': '31-Jul-2020 20:48:09',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-10(5).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 5,\n",
       "                 'datetime': '31-Jul-2020 20:48:10',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-10(6).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 6,\n",
       "                 'datetime': '31-Jul-2020 20:48:10',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-17(7).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 7,\n",
       "                 'datetime': '31-Jul-2020 20:48:17',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-17(8).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 8,\n",
       "                 'datetime': '31-Jul-2020 20:48:17',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-17(9).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 9,\n",
       "                 'datetime': '31-Jul-2020 20:48:17',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-25(10).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 10,\n",
       "                 'datetime': '31-Jul-2020 20:48:25',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-25(11).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 11,\n",
       "                 'datetime': '31-Jul-2020 20:48:25',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-25(12).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 12,\n",
       "                 'datetime': '31-Jul-2020 20:48:25',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-34(13).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 13,\n",
       "                 'datetime': '31-Jul-2020 20:48:34',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-34(14).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 14,\n",
       "                 'datetime': '31-Jul-2020 20:48:34',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-34(15).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 15,\n",
       "                 'datetime': '31-Jul-2020 20:48:34',\n",
       "                 'count': 6,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-42(16).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 16,\n",
       "                 'datetime': '31-Jul-2020 20:48:42',\n",
       "                 'count': 7,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-42(17).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 17,\n",
       "                 'datetime': '31-Jul-2020 20:48:42',\n",
       "                 'count': 7,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-42(18).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 18,\n",
       "                 'datetime': '31-Jul-2020 20:48:42',\n",
       "                 'count': 7,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-52(19).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 19,\n",
       "                 'datetime': '31-Jul-2020 20:48:52',\n",
       "                 'count': 3,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-52(20).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 20,\n",
       "                 'datetime': '31-Jul-2020 20:48:52',\n",
       "                 'count': 3,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D015450/DWC N 19/D015450__DWC N 19__2020-07-31__20-48-52(21).JPG',\n",
       "                 'class': ['white-breasted guineafowl'],\n",
       "                 'species_group': 'bird',\n",
       "                 'frame_num': 21,\n",
       "                 'datetime': '31-Jul-2020 20:48:52',\n",
       "                 'count': 3,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D020455__DWC N 30__2020-08-14__06-40'),\n",
       "              ('location', 'D020455/DWC N 30'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D020455/DWC N 30/D020455__DWC N 30__2020-08-14__06-40-00(1).JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '14-Aug-2020 06:40:00',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D020455/DWC N 30/D020455__DWC N 30__2020-08-14__06-40-00(2).JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '14-Aug-2020 06:40:00',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D020455/DWC N 30/D020455__DWC N 30__2020-08-14__06-40-00(3).JPG',\n",
       "                 'class': ['empty'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '14-Aug-2020 06:40:00',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D145630__N6__2019-12-22__16-05'),\n",
       "              ('location', 'D145630/N6'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D145630/N6/D145630__N6__2019-12-22__16-05-00(1).JPG',\n",
       "                 'class': ['unknown'],\n",
       "                 'species_group': None,\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '22-Dec-19 16:05:00',\n",
       "                 'count': 0,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)]),\n",
       " OrderedDict([('dataset', 'rspb_gola_2020'),\n",
       "              ('seq_id', 'D185605__DWCN30__2020-03-13__04-51'),\n",
       "              ('location', 'D185605/DWCN30'),\n",
       "              ('images',\n",
       "               [{'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D185605/DWCN30/D185605__DWCN30__2020-03-13__04-51-06(1).JPG',\n",
       "                 'class': ['genet sp'],\n",
       "                 'species_group': 'carnivore',\n",
       "                 'frame_num': 1,\n",
       "                 'datetime': '13-Mar-2020 04:51:06',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D185605/DWCN30/D185605__DWCN30__2020-03-13__04-51-07(2).JPG',\n",
       "                 'class': ['genet sp'],\n",
       "                 'species_group': 'carnivore',\n",
       "                 'frame_num': 2,\n",
       "                 'datetime': '13-Mar-2020 04:51:07',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D185605/DWCN30/D185605__DWCN30__2020-03-13__04-51-07(3).JPG',\n",
       "                 'class': ['genet sp'],\n",
       "                 'species_group': 'carnivore',\n",
       "                 'frame_num': 3,\n",
       "                 'datetime': '13-Mar-2020 04:51:07',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D185605/DWCN30/D185605__DWCN30__2020-03-13__04-51-13(4).JPG',\n",
       "                 'class': ['genet sp'],\n",
       "                 'species_group': 'carnivore',\n",
       "                 'frame_num': 4,\n",
       "                 'datetime': '13-Mar-2020 04:51:13',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D185605/DWCN30/D185605__DWCN30__2020-03-13__04-51-13(5).JPG',\n",
       "                 'class': ['genet sp'],\n",
       "                 'species_group': 'carnivore',\n",
       "                 'frame_num': 5,\n",
       "                 'datetime': '13-Mar-2020 04:51:13',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None},\n",
       "                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D185605/DWCN30/D185605__DWCN30__2020-03-13__04-51-13(6).JPG',\n",
       "                 'class': ['genet sp'],\n",
       "                 'species_group': 'carnivore',\n",
       "                 'frame_num': 6,\n",
       "                 'datetime': '13-Mar-2020 04:51:13',\n",
       "                 'count': 1,\n",
       "                 'age': None,\n",
       "                 'obstruction': None}]),\n",
       "              ('sex', None)])]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Eyeball a random handful of the converted sequence items.\n",
    "sample(sequences, k=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second check (schema conformance) can take some time (about a minute).\n",
    "\n",
    "If the items do not conform, an error message will describe what's wrong. Please fix the issues until all checks pass. You may need to write a few snippets of code that loop through the `sequence` items to find out which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 1.1 s, sys: 3.66 ms, total: 1.11 s\n",
      "Wall time: 1.11 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Check the sequence items against the MegaDB `sequence` format before writing them out.\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the checked sequence items as UTF-8 JSON, keeping non-ASCII characters unescaped.\n",
    "with open(path_to_output_temp, mode='w', encoding='utf-8') as out_file:\n",
    "    json.dump(sequences, out_file, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - Copy images to a flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_file(src_path, dst_path):\n",
    "    \"\"\"Copy the file at src_path to dst_path and return the destination path.\n",
    "\n",
    "    Thin wrapper around shutil.copyfile, used as the worker function for\n",
    "    ThreadPool.starmap in the copy step below.\n",
    "    \"\"\"\n",
    "    copied_to = copyfile(src_path, dst_path)\n",
    "    return copied_to"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2509/2509 [00:00<00:00, 25779.33it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 78 ms, sys: 24.5 ms, total: 102 ms\n",
      "Wall time: 100 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Flat folder that every copied image goes into. Create it up front so the copy\n",
    "# step does not fail on a machine where the folder does not exist yet.\n",
    "flat_dst_dir = '/mink_disk_0/camtraps/imerit12b'\n",
    "os.makedirs(flat_dst_dir, exist_ok=True)\n",
    "\n",
    "path_pairs = []  # (source path, flat destination path) for each non-empty image\n",
    "for seq in tqdm(sequences):\n",
    "    seq_id = seq['seq_id']\n",
    "    for im in seq['images']:\n",
    "        # Images labelled 'empty' are not copied.\n",
    "        if 'empty' in im['class']:\n",
    "            continue\n",
    "\n",
    "        src_path = os.path.join(downloaded_dir, im['file'])\n",
    "        # Fail early, naming the file, if the metadata points at a missing image.\n",
    "        assert os.path.exists(src_path), src_path\n",
    "        frame = im['frame_num']\n",
    "        # Flattened name encodes dataset, sequence and frame.\n",
    "        dst_path = os.path.join(flat_dst_dir,\n",
    "                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "        path_pairs.append((src_path, dst_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12985"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/rspb_gola_2020/Gola Darwin 2020 - humans removed/Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D145580/N4/D145580__N4__2019-12-17__09-58-47(3).JPG',\n",
       " '/mink_disk_0/camtraps/imerit12b/rspb_gola_2020.seqD145580__N4__2019-12-17__09-58.frame3.jpg')"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)  # non-empty images: 12985 of the 14282 total (~91%)\n",
    "path_pairs[10000]  # spot-check one (source, destination) pair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.2 s, sys: 48.7 s, total: 1min 11s\n",
      "Wall time: 34.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "with ThreadPool(8) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
